import pandas as pd
import os
import re

# 1. Read the domain list, which maps each 'Domain' to its 'Media'.
#    Cells are read as text (same as the main data below) so numeric-looking
#    values stay strings for the `.str` normalization and for building filenames.
domain_df = pd.read_excel("Domain_List.xlsx", usecols=["Domain", "Media"], dtype=str)

# Normalize domain names (lowercase, surrounding whitespace removed) for matching
domain_df["Domain_clean"] = domain_df["Domain"].str.lower().str.strip()

# 2. Read the main data file with all columns (including 'Domain') as text
both_df = pd.read_excel("Both Data with Domains.xlsx", dtype=str)
both_df["Domain_clean"] = both_df["Domain"].str.lower().str.strip()

# 3. Prepare output directory (no error if it already exists)
output_dir = "Filtered_By_Media"
os.makedirs(output_dir, exist_ok=True)


# 4. Helper to sanitize Media names into safe filenames
def make_safe_filename(name, max_length: int = 50) -> str:
    """Return a filesystem-safe version of *name*.

    Every character that is not a word character (letter, digit or
    underscore) or a hyphen is replaced with an underscore, then the result
    is truncated to *max_length* characters.

    Args:
        name: Value to sanitize. Non-string values (for example a number
            read from a spreadsheet cell) are converted with ``str()`` first
            instead of raising ``TypeError`` inside ``re.sub``.
        max_length: Maximum number of characters kept from the sanitized
            name. Defaults to 50.

    Returns:
        The sanitized name, or ``"unnamed"`` when nothing is left (empty
        input), so callers never end up with a file called just ``.xlsx``.
    """
    # Replace (not remove) each disallowed character so word boundaries stay visible.
    safe = re.sub(r"[^\w\-]", "_", str(name))[:max_length]
    return safe or "unnamed"


# 5. Group the domain list by media, then filter and save ONE file per media.
#    Several domains can share the same media; writing the file once per
#    (domain, media) pair would let each later domain overwrite the records
#    saved for an earlier one.
usable_pairs = domain_df.dropna(subset=["Domain_clean", "Media"])
skipped_pairs = len(domain_df) - len(usable_pairs)
if skipped_pairs:
    print(f"Skipped {skipped_pairs} domain list row(s) with a missing domain or media.")

# Domains that actually occur in the main data, built once for the per-domain report
known_domains = set(both_df["Domain_clean"].dropna())

# sort=False keeps the media in the order they appear in the domain list
for media, media_rows in usable_pairs.groupby("Media", sort=False):
    media_domains = media_rows["Domain_clean"].unique()

    # Still report each listed domain that has no data at all
    for domain in media_domains:
        if domain not in known_domains:
            print(f"No records found for domain '{domain}' (media: '{media}').")

    # Rows matching any of this media's domains, in their original order
    df_filtered = both_df[both_df["Domain_clean"].isin(media_domains)].drop(columns=["Domain_clean"])
    if df_filtered.empty:
        continue

    # NOTE: two different media names that sanitize to the same filename
    # would still share (and overwrite) one output file.
    safe_media = make_safe_filename(str(media))
    out_path = os.path.join(output_dir, f"{safe_media}.xlsx")
    df_filtered.to_excel(out_path, index=False)
    print(f"Wrote {len(df_filtered)} records for media '{media}' to {out_path}")

print("✅ All filtered files have been generated.")
